Predict sale prices and practice feature engineering, random forests (RFs), and gradient boosting
library(ggplot2)
# Root directory of the project; the CSV files are read from its data/ folder.
PROJ_PATH <- '~/Documents/kaggle/house_prices'

# stringsAsFactors is pinned to TRUE so text columns are always read as
# factors. Since R 4.0.0 read.csv() leaves them as character vectors by
# default, and plotting a character column against SalePrice then fails where
# a factor column produces box plots.
train <- read.csv(file.path(PROJ_PATH, 'data/train.csv'),
                  stringsAsFactors = TRUE)
test <- read.csv(file.path(PROJ_PATH, 'data/test.csv'),
                 stringsAsFactors = TRUE)

# Column names of the training set; used to title plots by column position.
nm <- names(train)
# Positions (within names(train)) of the columns treated as numeric.
# Runs of consecutive positions are written as ranges; the trailing comments
# give the column names found at those positions.
num_idx <- c(
  4:5,    # LotFrontage, LotArea
  18:21,  # OverallQual, OverallCond, YearBuilt, YearRemodAdd
  27,     # MasVnrArea
  35,     # BsmtFinSF1
  37:39,  # BsmtFinSF2, BsmtUnfSF, TotalBsmtSF
  44:53,  # X1stFlrSF through KitchenAbvGr
  55,     # TotRmsAbvGrd
  57,     # Fireplaces
  60,     # GarageYrBlt
  62:63,  # GarageCars, GarageArea
  67:72,  # WoodDeckSF through PoolArea
  76:78   # MiscVal, MoSold, YrSold
)
# Positions of the columns treated as categorical: every position from the
# second column up to the one before the last that is not listed in num_idx.
cat_idx <- setdiff(seq(2, ncol(train) - 1), num_idx)
# Histogram with this script's house style.
#
# ...     forwarded unchanged to hist() (the data vector, main, etc.).
# breaks  passed to hist() as its `breaks` argument; defaults to 30.
# col     fill colour, also used for the bar borders.
# xlab    x-axis label; NULL (the default) draws no label.
#
# Returns whatever hist() returns.
hist2 <- function(..., breaks = 30, col = 'darkgray', xlab = NULL) {
  hist(
    ...,
    xlab = xlab,
    border = col,  # same colour as the fill, so bars have no visible outline
    col = col,
    breaks = breaks
  )
}
# Horizontal bar plot with this script's house style.
#
# ...   forwarded unchanged to barplot() (the heights, main, cex.names, etc.).
# col   fill colour, also used for the bar borders.
# xlab  x-axis label; NULL (the default) draws no label. It is forwarded to
#       barplot() so that a label supplied by the caller is actually drawn
#       instead of being silently discarded.
#
# Bars are always drawn horizontally (horiz = TRUE).
# Returns whatever barplot() returns.
barplot2 <- function(..., col = 'darkgray', xlab = NULL) {
  barplot(..., col = col, border = col, horiz = TRUE, xlab = xlab)
}
# plot() with this script's house style.
#
# ...  forwarded unchanged to plot() (the data, main, ylim, etc.).
# col  drawing colour; defaults to 'gray30' at 20% opacity.
# bty  box type around the plot; defaults to 'n'.
#
# Returns whatever plot() returns.
plot2 <- function(..., col = adjustcolor('gray30', alpha.f = 0.2), bty = 'n') {
  plot(
    ...,
    bty = bty,
    col = col
  )
}
Data description
# Size of the training set; the echoed output below shows 1460 rows and
# 81 columns.
dim(train)
## [1] 1460 81
# Column names in positional order. The bracketed number at the start of each
# output line is the position of the first name on that line, which is how the
# integer positions in num_idx and cat_idx map onto columns.
names(train)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
Histograms of numeric variables
# Four panels per row, with 3-line margins on every side.
par(mar = rep(3, 4), mfrow = c(1, 4))

# One histogram per numeric column, titled with the column name.
for (ni in num_idx) {
  hist2(train[, ni], main = nm[ni])
}

# The target variable gets its own histogram after the predictors.
hist2(train[, 'SalePrice'], main = 'SalePrice')
Frequencies of categorical variables
# Axis labels perpendicular to the axes, a wider left margin for the level
# names, and four panels per row.
par(las = 2, mar = c(3, 4, 3, 3), mfrow = c(1, 4))

# One bar plot of level counts per categorical column, titled with its name.
for (ci in cat_idx) {
  barplot2(table(train[, ci]), main = nm[ci])
}
Proportion of missing values per variable (only variables with at least one missing value)
# Fraction of NA entries in each column of the training set, in ascending
# order. (Despite the name, values are proportions in [0, 1], not percentages.)
missing_perc <- sort(
  vapply(
    train,
    function(column) sum(is.na(column)) / length(column),
    numeric(1)
  )
)

# Perpendicular axis labels and a wide left margin so column names fit.
par(las = 2, mar = c(3, 7, 2, 2))

# Only columns that have at least one missing value are drawn.
barplot2(
  missing_perc[missing_perc > 0],
  main = 'Missingness',
  xlim = c(0, 1),
  cex.axis = 0.6,
  cex.names = 0.6
)
Relation of each variable to price
# Numeric columns against SalePrice, four panels per row. Every panel shares
# the full SalePrice range on the y axis so panels are directly comparable.
par(mar = rep(3, 4), mfrow = c(1, 4))
for (ni in num_idx) {
  plot2(
    train[c(nm[ni], 'SalePrice')],
    ylim = range(train$SalePrice),
    main = nm[ni]
  )
}

# Categorical columns against SalePrice. The layout is set again on purpose:
# re-issuing mfrow makes these panels start on a fresh page.
par(mar = rep(3, 4), mfrow = c(1, 4))
for (ni in cat_idx) {
  plot2(
    train[c(nm[ni], 'SalePrice')],
    ylim = range(train$SalePrice),
    main = nm[ni]
  )
}